import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import (
classification_report,
confusion_matrix,
recall_score,
accuracy_score,
precision_score,
f1_score,
)
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
# to build logistic regression model
from sklearn.linear_model import LogisticRegression
# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
# to create k folds of data and get cross validation score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# To do one-hot encoding
from sklearn.preprocessing import OneHotEncoder
# To undersample and oversample the data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# to ignore warnings
import warnings
warnings.filterwarnings('ignore')
!pip install shap
Requirement already satisfied: shap in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (0.41.0) Requirement already satisfied: scikit-learn in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from shap) (1.0.2) Requirement already satisfied: tqdm>4.25.0 in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from shap) (4.64.0) Requirement already satisfied: slicer==0.0.7 in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from shap) (0.0.7) Requirement already satisfied: cloudpickle in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from shap) (2.0.0) Requirement already satisfied: pandas in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from shap) (1.4.2) Requirement already satisfied: numpy in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from shap) (1.21.5) Requirement already satisfied: numba in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from shap) (0.55.1) Requirement already satisfied: packaging>20.9 in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from shap) (21.3) Requirement already satisfied: scipy in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from shap) (1.7.3) Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from packaging>20.9->shap) (3.0.4) Requirement already satisfied: colorama in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from tqdm>4.25.0->shap) (0.4.4) Requirement already satisfied: setuptools in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from numba->shap) (61.2.0) Requirement already satisfied: llvmlite<0.39,>=0.38.0rc1 in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from numba->shap) (0.38.0) Requirement already satisfied: pytz>=2020.1 in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from pandas->shap) (2021.3) Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from 
pandas->shap) (2.8.2) Requirement already satisfied: six>=1.5 in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from python-dateutil>=2.8.1->pandas->shap) (1.16.0) Requirement already satisfied: joblib>=0.11 in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from scikit-learn->shap) (1.1.0) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\paige\anaconda3\envs\utagl_5\lib\site-packages (from scikit-learn->shap) (2.2.0)
# Load the raw churn dataset and work on a copy so the original frame stays untouched
df = pd.read_csv("BankChurners.csv")
data = df.copy()
# Preview the first five rows
data.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | ... | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | ... | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 |
5 rows × 21 columns
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 8608 non-null object 6 Marital_Status 9378 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
#number of rows and columns
data.shape
(10127, 21)
# check for null values
data.isnull().sum()
CLIENTNUM 0 Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
#number of unique values in each column
data.nunique()
CLIENTNUM 10127 Attrition_Flag 2 Customer_Age 45 Gender 2 Dependent_count 6 Education_Level 6 Marital_Status 3 Income_Category 6 Card_Category 4 Months_on_book 44 Total_Relationship_Count 6 Months_Inactive_12_mon 7 Contacts_Count_12_mon 7 Credit_Limit 6205 Total_Revolving_Bal 1974 Avg_Open_To_Buy 6813 Total_Amt_Chng_Q4_Q1 1158 Total_Trans_Amt 5033 Total_Trans_Ct 126 Total_Ct_Chng_Q4_Q1 830 Avg_Utilization_Ratio 964 dtype: int64
#Check for duplicates
data[data.duplicated()].count()
CLIENTNUM 0 Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
#Understand the values of column that contains nulls
data['Education_Level'].value_counts()
Graduate 3128 High School 2013 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64
#Understand the values of column that contains nulls
data['Marital_Status'].value_counts()
Married 4687 Single 3943 Divorced 748 Name: Marital_Status, dtype: int64
# Understand the values in each column that is categorical (regardless of type)
cat_cols = [
    'Attrition_Flag', 'Gender', 'Dependent_count', 'Education_Level',
    'Marital_Status', 'Income_Category', 'Card_Category',
    'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon',
]
for column in cat_cols:
    print(data[column].value_counts())
    print("_________________________________")
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64 _________________________________ F 5358 M 4769 Name: Gender, dtype: int64 _________________________________ 3 2732 2 2655 1 1838 4 1574 0 904 5 424 Name: Dependent_count, dtype: int64 _________________________________ Graduate 3128 High School 2013 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64 _________________________________ Married 4687 Single 3943 Divorced 748 Name: Marital_Status, dtype: int64 _________________________________ Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 abc 1112 $120K + 727 Name: Income_Category, dtype: int64 _________________________________ Blue 9436 Silver 555 Gold 116 Platinum 20 Name: Card_Category, dtype: int64 _________________________________ 3 2305 4 1912 5 1891 6 1866 2 1243 1 910 Name: Total_Relationship_Count, dtype: int64 _________________________________ 3 3846 2 3282 1 2233 4 435 5 178 6 124 0 29 Name: Months_Inactive_12_mon, dtype: int64 _________________________________ 3 3380 2 3227 1 1499 4 1392 0 399 5 176 6 54 Name: Contacts_Count_12_mon, dtype: int64 _________________________________
# Calculate summary stats for numerical columns that are not categories
num_cols = ['Customer_Age','Months_on_book','Credit_Limit','Total_Revolving_Bal','Avg_Open_To_Buy','Total_Amt_Chng_Q4_Q1','Total_Trans_Amt','Total_Trans_Ct','Total_Ct_Chng_Q4_Q1','Avg_Utilization_Ratio']
# Transposed describe(): one row per feature, columns = count/mean/std/min/quartiles/max
data[num_cols].describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Customer_Age | 10127.0 | 46.325960 | 8.016814 | 26.0 | 41.000 | 46.000 | 52.000 | 73.000 |
| Months_on_book | 10127.0 | 35.928409 | 7.986416 | 13.0 | 31.000 | 36.000 | 40.000 | 56.000 |
| Credit_Limit | 10127.0 | 8631.953698 | 9088.776650 | 1438.3 | 2555.000 | 4549.000 | 11067.500 | 34516.000 |
| Total_Revolving_Bal | 10127.0 | 1162.814061 | 814.987335 | 0.0 | 359.000 | 1276.000 | 1784.000 | 2517.000 |
| Avg_Open_To_Buy | 10127.0 | 7469.139637 | 9090.685324 | 3.0 | 1324.500 | 3474.000 | 9859.000 | 34516.000 |
| Total_Amt_Chng_Q4_Q1 | 10127.0 | 0.759941 | 0.219207 | 0.0 | 0.631 | 0.736 | 0.859 | 3.397 |
| Total_Trans_Amt | 10127.0 | 4404.086304 | 3397.129254 | 510.0 | 2155.500 | 3899.000 | 4741.000 | 18484.000 |
| Total_Trans_Ct | 10127.0 | 64.858695 | 23.472570 | 10.0 | 45.000 | 67.000 | 81.000 | 139.000 |
| Total_Ct_Chng_Q4_Q1 | 10127.0 | 0.712222 | 0.238086 | 0.0 | 0.582 | 0.702 | 0.818 | 3.714 |
| Avg_Utilization_Ratio | 10127.0 | 0.274894 | 0.275691 | 0.0 | 0.023 | 0.176 | 0.503 | 0.999 |
# Convert the low-cardinality columns to the memory-efficient 'category' dtype
data[cat_cols] = data[cat_cols].astype('category')
# From Anime Rating Prediction case study, function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
"""
Boxplot and histogram combined
data: dataframe
feature: dataframe column
figsize: size of figure (default (12,7))
kde: whether to the show density curve (default False)
bins: number of bins for histogram (default None)
"""
f2, (ax_box2, ax_hist2) = plt.subplots(
nrows=2, # Number of rows of the subplot grid= 2
sharex=True, # x-axis will be shared among all subplots
gridspec_kw={"height_ratios": (0.25, 0.75)},
figsize=figsize,
) # creating the 2 subplots
sns.boxplot(
data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
) # boxplot will be created and a star will indicate the mean value of the column
sns.histplot(
data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins, palette="winter"
) if bins else sns.histplot(
data=data, x=feature, kde=kde, ax=ax_hist2
) # For histogram
ax_hist2.axvline(
data[feature].mean(), color="green", linestyle="--"
) # Add mean to the histogram
ax_hist2.axvline(
data[feature].median(), color="black", linestyle="-"
) # Add median to the histogram
# Distribution of customer age
histogram_boxplot(data, 'Customer_Age')
# Compare the median age across churned vs. retained customers
data.groupby(['Attrition_Flag'])['Customer_Age'].median()
Attrition_Flag Attrited Customer 47.0 Existing Customer 46.0 Name: Customer_Age, dtype: float64
# Distribution of months on book
histogram_boxplot(data, 'Months_on_book')
# Median tenure by churn status
data.groupby(['Attrition_Flag'])['Months_on_book'].median()
Attrition_Flag Attrited Customer 36.0 Existing Customer 36.0 Name: Months_on_book, dtype: float64
# Distribution of credit limit
histogram_boxplot(data, 'Credit_Limit')
# Median credit limit by churn status
data.groupby(['Attrition_Flag'])['Credit_Limit'].median()
Attrition_Flag Attrited Customer 4178.0 Existing Customer 4643.5 Name: Credit_Limit, dtype: float64
# Distribution of total revolving balance
histogram_boxplot(data, 'Total_Revolving_Bal')
# Median revolving balance by churn status
data.groupby(['Attrition_Flag'])['Total_Revolving_Bal'].median()
Attrition_Flag Attrited Customer 0.0 Existing Customer 1364.0 Name: Total_Revolving_Bal, dtype: float64
# Distribution of average open to buy
histogram_boxplot(data, 'Avg_Open_To_Buy')
# Median open-to-buy by churn status
data.groupby(['Attrition_Flag'])['Avg_Open_To_Buy'].median()
Attrition_Flag Attrited Customer 3488.0 Existing Customer 3469.5 Name: Avg_Open_To_Buy, dtype: float64
# Distribution of total amount change (Q4 over Q1)
histogram_boxplot(data, 'Total_Amt_Chng_Q4_Q1')
# Median amount change by churn status
data.groupby(['Attrition_Flag'])['Total_Amt_Chng_Q4_Q1'].median()
Attrition_Flag Attrited Customer 0.701 Existing Customer 0.743 Name: Total_Amt_Chng_Q4_Q1, dtype: float64
# Distribution of total transaction amount
histogram_boxplot(data, 'Total_Trans_Amt')
# Median transaction amount by churn status
data.groupby(['Attrition_Flag'])['Total_Trans_Amt'].median()
Attrition_Flag Attrited Customer 2329.0 Existing Customer 4100.0 Name: Total_Trans_Amt, dtype: float64
# Distribution of total transaction count
histogram_boxplot(data, 'Total_Trans_Ct')
# Median transaction count by churn status
data.groupby(['Attrition_Flag'])['Total_Trans_Ct'].median()
Attrition_Flag Attrited Customer 43.0 Existing Customer 71.0 Name: Total_Trans_Ct, dtype: float64
# Distribution of total count change (Q4 over Q1)
histogram_boxplot(data, 'Total_Ct_Chng_Q4_Q1')
# Median count change by churn status
data.groupby(['Attrition_Flag'])['Total_Ct_Chng_Q4_Q1'].median()
Attrition_Flag Attrited Customer 0.531 Existing Customer 0.721 Name: Total_Ct_Chng_Q4_Q1, dtype: float64
# Distribution of average utilization ratio
histogram_boxplot(data, 'Avg_Utilization_Ratio')
# Median utilization ratio by churn status
data.groupby(['Attrition_Flag'])['Avg_Utilization_Ratio'].median()
Attrition_Flag Attrited Customer 0.000 Existing Customer 0.211 Name: Avg_Utilization_Ratio, dtype: float64
# Explore numerical variable relationships and multi-collinearity
# (data.corr() uses only the numeric columns of the frame)
plt.figure(figsize=(15, 7))
sns.heatmap(data.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
Consider dropping variables based on multicollinearity.
sns.catplot(data=data, kind='count',x='Gender', hue='Attrition_Flag')
<seaborn.axisgrid.FacetGrid at 0x192dda7da90>
data['Gender'].value_counts(1)*100
F 52.908068 M 47.091932 Name: Gender, dtype: float64
data.groupby(['Attrition_Flag'])['Gender'].value_counts(1)*100
Attrition_Flag
Attrited Customer F 57.160418
M 42.839582
Existing Customer F 52.094118
M 47.905882
Name: Gender, dtype: float64
sns.catplot(data=data, kind='count',x='Dependent_count', hue='Attrition_Flag')
<seaborn.axisgrid.FacetGrid at 0x192de1fb250>
data['Dependent_count'].value_counts(1)*100
3 26.977387 2 26.217044 1 18.149501 4 15.542609 0 8.926632 5 4.186827 Name: Dependent_count, dtype: float64
data.groupby(['Attrition_Flag'])['Dependent_count'].value_counts(1)*100
Attrition_Flag
Attrited Customer 3 29.625077
2 25.629994
1 16.533497
4 15.980332
0 8.297480
5 3.933620
Existing Customer 3 26.470588
2 26.329412
1 18.458824
4 15.458824
0 9.047059
5 4.235294
Name: Dependent_count, dtype: float64
sns.catplot(data=data, kind='count',x='Education_Level', hue='Attrition_Flag')
<seaborn.axisgrid.FacetGrid at 0x192dda7d910>
data['Education_Level'].value_counts(1)*100
Graduate 36.338290 High School 23.385223 Uneducated 17.274628 College 11.768123 Post-Graduate 5.994424 Doctorate 5.239312 Name: Education_Level, dtype: float64
data.groupby(['Attrition_Flag'])['Education_Level'].value_counts(1)*100
Attrition_Flag
Attrited Customer Graduate 35.521517
High School 22.319475
Uneducated 17.286652
College 11.232677
Doctorate 6.929249
Post-Graduate 6.710430
Existing Customer Graduate 36.493022
High School 23.587122
Uneducated 17.272350
College 11.869559
Post-Graduate 5.858781
Doctorate 4.919165
Name: Education_Level, dtype: float64
data.groupby(['Education_Level'])['Customer_Age'].median()
Education_Level College 46.0 Doctorate 47.0 Graduate 46.0 High School 46.0 Post-Graduate 47.0 Uneducated 46.0 Name: Customer_Age, dtype: float64
data.groupby(['Education_Level'])['Income_Category'].value_counts(1)*100
Education_Level
College Less than $40K 34.057256
$40K - $60K 18.065153
$80K - $120K 17.275420
$60K - $80K 13.030602
abc 10.661402
$120K + 6.910168
Doctorate Less than $40K 35.033259
$40K - $60K 15.521064
abc 15.521064
$60K - $80K 13.082040
$80K - $120K 12.638581
$120K + 8.203991
Graduate Less than $40K 36.413043
$40K - $60K 17.679028
$80K - $120K 15.281330
$60K - $80K 13.491049
abc 10.613811
$120K + 6.521739
High School Less than $40K 33.333333
$40K - $60K 17.635370
$80K - $120K 15.300546
$60K - $80K 15.250869
abc 11.177347
$120K + 7.302534
Post-Graduate Less than $40K 32.945736
$40K - $60K 21.511628
$80K - $120K 15.697674
$60K - $80K 14.922481
abc 9.108527
$120K + 5.813953
Uneducated Less than $40K 35.104237
$40K - $60K 16.745124
$80K - $120K 14.593141
$60K - $80K 13.113652
abc 12.441157
$120K + 8.002690
Name: Income_Category, dtype: float64
data.groupby(['Income_Category'])['Education_Level'].value_counts(1)*100
Income_Category
$120K + Graduate 33.607908
High School 24.217463
Uneducated 19.604613
College 11.532125
Doctorate 6.095552
Post-Graduate 4.942339
$40K - $60K Graduate 36.357659
High School 23.339908
Uneducated 16.370809
College 12.031558
Post-Graduate 7.297830
Doctorate 4.602235
$60K - $80K Graduate 35.402685
High School 25.755034
Uneducated 16.359060
College 11.073826
Post-Graduate 6.459732
Doctorate 4.949664
$80K - $120K Graduate 36.322188
High School 23.404255
Uneducated 16.489362
College 13.297872
Post-Graduate 6.155015
Doctorate 4.331307
Less than $40K Graduate 37.903494
High School 22.329451
Uneducated 17.371048
College 11.480865
Post-Graduate 5.657238
Doctorate 5.257903
abc Graduate 34.332989
High School 23.267839
Uneducated 19.131334
College 11.168563
Doctorate 7.238883
Post-Graduate 4.860393
Name: Education_Level, dtype: float64
sns.catplot(data=data, kind='count',x='Marital_Status', hue='Attrition_Flag')
<seaborn.axisgrid.FacetGrid at 0x192dd944e50>
data['Marital_Status'].value_counts(1)*100
Married 49.978673 Single 42.045212 Divorced 7.976114 Name: Marital_Status, dtype: float64
data.groupby(['Attrition_Flag'])['Marital_Status'].value_counts(1)*100
Attrition_Flag
Attrited Customer Married 47.329773
Single 44.592790
Divorced 8.077437
Existing Customer Married 50.482234
Single 41.560914
Divorced 7.956853
Name: Marital_Status, dtype: float64
data.groupby(['Marital_Status'])['Customer_Age'].median()
Marital_Status Divorced 45.0 Married 47.0 Single 46.0 Name: Customer_Age, dtype: float64
data.groupby(['Income_Category'])['Marital_Status'].value_counts(1)*100
Income_Category
$120K + Married 52.058824
Single 40.294118
Divorced 7.647059
$40K - $60K Married 49.215923
Single 42.460796
Divorced 8.323281
$60K - $80K Married 50.846154
Single 40.846154
Divorced 8.307692
$80K - $120K Married 52.537527
Single 40.100071
Divorced 7.362402
Less than $40K Married 49.169435
Single 43.159166
Divorced 7.671398
abc Married 47.864078
Single 43.106796
Divorced 9.029126
Name: Marital_Status, dtype: float64
data.groupby(['Dependent_count'])['Marital_Status'].value_counts(1)*100
Dependent_count
0 Single 46.341463
Married 44.947735
Divorced 8.710801
1 Married 48.740480
Single 45.108377
Divorced 6.151142
2 Married 51.545564
Single 40.264954
Divorced 8.189482
3 Married 50.180505
Single 41.195347
Divorced 8.624148
4 Married 50.521195
Single 40.792217
Divorced 8.686588
5 Married 53.229974
Single 40.568475
Divorced 6.201550
Name: Marital_Status, dtype: float64
data.groupby(['Total_Relationship_Count'])['Marital_Status'].value_counts(1)*100
Total_Relationship_Count
1 Married 46.450060
Single 45.968712
Divorced 7.581227
2 Married 49.511979
Single 43.034605
Divorced 7.453416
3 Married 51.152074
Single 40.967742
Divorced 7.880184
4 Married 49.408451
Single 42.873239
Divorced 7.718310
5 Married 49.828767
Single 41.038813
Divorced 9.132420
6 Married 51.247824
Single 41.033082
Divorced 7.719095
Name: Marital_Status, dtype: float64
sns.catplot(data=data, kind='count',x='Income_Category', hue='Attrition_Flag')
<seaborn.axisgrid.FacetGrid at 0x192dd813880>
data['Income_Category'].value_counts(1)*100
Less than $40K 35.163425 $40K - $60K 17.675521 $80K - $120K 15.157500 $60K - $80K 13.844179 abc 10.980547 $120K + 7.178829 Name: Income_Category, dtype: float64
data.groupby(['Attrition_Flag'])['Income_Category'].value_counts(1)*100
Attrition_Flag
Attrited Customer Less than $40K 37.615243
$40K - $60K 16.656423
$80K - $120K 14.874001
$60K - $80K 11.616472
abc 11.493546
$120K + 7.744315
Existing Customer Less than $40K 34.694118
$40K - $60K 17.870588
$80K - $120K 15.211765
$60K - $80K 14.270588
abc 10.882353
$120K + 7.070588
Name: Income_Category, dtype: float64
data.groupby(['Income_Category'])['Credit_Limit'].median()
Income_Category $120K + 18442.0 $40K - $60K 3682.0 $60K - $80K 7660.0 $80K - $120K 12830.0 Less than $40K 2766.0 abc 6380.0 Name: Credit_Limit, dtype: float64
sns.catplot(data=data, kind='count',x='Card_Category', hue='Attrition_Flag')
<seaborn.axisgrid.FacetGrid at 0x192de27ec70>
data['Card_Category'].value_counts(1)*100
Blue 93.176656 Silver 5.480399 Gold 1.145453 Platinum 0.197492 Name: Card_Category, dtype: float64
data.groupby(['Attrition_Flag'])['Card_Category'].value_counts(1)*100
Attrition_Flag
Attrited Customer Blue 93.362016
Silver 5.039951
Gold 1.290719
Platinum 0.307314
Existing Customer Blue 93.141176
Silver 5.564706
Gold 1.117647
Platinum 0.176471
Name: Card_Category, dtype: float64
sns.catplot(data=data, kind='count',x='Total_Relationship_Count', hue='Attrition_Flag')
<seaborn.axisgrid.FacetGrid at 0x192de1b0eb0>
data['Total_Relationship_Count'].value_counts(1)*100
3 22.760936 4 18.880221 5 18.672855 6 18.425990 2 12.274119 1 8.985879 Name: Total_Relationship_Count, dtype: float64
data.groupby(['Attrition_Flag'])['Total_Relationship_Count'].value_counts(1)*100
Attrition_Flag
Attrited Customer 3 24.585126
2 21.266134
1 14.320836
5 13.952059
4 13.829133
6 12.046712
Existing Customer 3 22.411765
4 19.847059
6 19.647059
5 19.576471
2 10.552941
1 7.964706
Name: Total_Relationship_Count, dtype: float64
sns.catplot(data=data, kind='count',x='Months_Inactive_12_mon', hue='Attrition_Flag')
<seaborn.axisgrid.FacetGrid at 0x192dd74ac40>
data['Months_Inactive_12_mon'].value_counts(1)*100
3 37.977683 2 32.408413 1 22.049965 4 4.295448 5 1.757677 6 1.224449 0 0.286363 Name: Months_Inactive_12_mon, dtype: float64
data.groupby(['Attrition_Flag'])['Months_Inactive_12_mon'].value_counts(1)*100
Attrition_Flag
Attrited Customer 3 50.768285
2 31.038722
4 7.990166
1 6.146281
5 1.966810
6 1.167793
0 0.921942
Existing Customer 3 35.529412
2 32.670588
1 25.094118
4 3.588235
5 1.717647
6 1.235294
0 0.164706
Name: Months_Inactive_12_mon, dtype: float64
sns.catplot(data=data, kind='count',x='Contacts_Count_12_mon', hue='Attrition_Flag')
<seaborn.axisgrid.FacetGrid at 0x192dd7b3b20>
data['Contacts_Count_12_mon'].value_counts(1)*100
3 33.376123 2 31.865311 1 14.802014 4 13.745433 0 3.939962 5 1.737928 6 0.533228 Name: Contacts_Count_12_mon, dtype: float64
data.groupby(['Attrition_Flag'])['Contacts_Count_12_mon'].value_counts(1)*100
Attrition_Flag
Attrited Customer 3 41.856177
2 24.769514
4 19.360787
1 6.637984
5 3.626306
6 3.318992
0 0.430240
Existing Customer 2 33.223529
3 31.752941
1 16.364706
4 12.670588
0 4.611765
5 1.376471
6 0.000000
Name: Contacts_Count_12_mon, dtype: float64
data.groupby(['Contacts_Count_12_mon'])['Attrition_Flag'].value_counts(1)*100
Contacts_Count_12_mon
0 Existing Customer 98.245614
Attrited Customer 1.754386
1 Existing Customer 92.795197
Attrited Customer 7.204803
2 Existing Customer 87.511621
Attrited Customer 12.488379
3 Existing Customer 79.852071
Attrited Customer 20.147929
4 Existing Customer 77.370690
Attrited Customer 22.629310
5 Existing Customer 66.477273
Attrited Customer 33.522727
6 Attrited Customer 100.000000
Existing Customer 0.000000
Name: Attrition_Flag, dtype: float64
# Drop: CLIENTNUM (row identifier, duplicates the index), Education_Level
# (~15% nulls — 1519/10127 — with no replacement strategy), and
# Avg_Utilization_Ratio (multicollinear with three other columns)
data = data.drop(['CLIENTNUM','Education_Level','Avg_Utilization_Ratio'], axis = 1)
# Keep a copy of the data before feature engineering so later steps can be redone
data2=data.copy()
# Pairwise scatter/KDE plots colored by churn status, to eyeball separability
sns.pairplot(data , hue='Attrition_Flag' , diag_kind = 'kde',diag_kws=dict(fill=False))
plt.show()
# Drop two more multicollinear features in a single pass:
#   Avg_Open_To_Buy - collinear with Credit_Limit
#   Total_Trans_Amt - collinear with Total_Trans_Ct
data = data.drop(['Avg_Open_To_Buy', 'Total_Trans_Amt'], axis=1)
data['Income_Category'].value_counts(1)*100
Less than $40K 35.163425 $40K - $60K 17.675521 $80K - $120K 15.157500 $60K - $80K 13.844179 abc 10.980547 $120K + 7.178829 Name: Income_Category, dtype: float64
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null category 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null category 4 Marital_Status 9378 non-null category 5 Income_Category 10127 non-null category 6 Card_Category 10127 non-null category 7 Months_on_book 10127 non-null int64 8 Total_Relationship_Count 10127 non-null category 9 Months_Inactive_12_mon 10127 non-null category 10 Contacts_Count_12_mon 10127 non-null category 11 Credit_Limit 10127 non-null float64 12 Total_Revolving_Bal 10127 non-null int64 13 Total_Amt_Chng_Q4_Q1 10127 non-null float64 14 Total_Trans_Ct 10127 non-null int64 15 Total_Ct_Chng_Q4_Q1 10127 non-null float64 dtypes: category(9), float64(3), int64(4) memory usage: 644.9 KB
# Impute the 'abc' placeholder income level and the missing marital statuses
# with each column's mode ('Less than $40K' and 'Married' respectively).
# Cast to object first so new labels are not rejected by the category dtype.
data['Income_Category'] = (
    data['Income_Category']
    .astype('object')
    .replace('abc', np.nan)
    .replace(np.nan, 'Less than $40K')
)
data['Marital_Status'] = (
    data['Marital_Status']
    .astype('object')
    .replace(np.nan, 'Married')
)
data['Marital_Status'].value_counts()
Married 5436 Single 3943 Divorced 748 Name: Marital_Status, dtype: int64
data['Income_Category'].value_counts(1)*100
Less than $40K 46.143972 $40K - $60K 17.675521 $80K - $120K 15.157500 $60K - $80K 13.844179 $120K + 7.178829 Name: Income_Category, dtype: float64
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null category 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null category 4 Marital_Status 10127 non-null object 5 Income_Category 10127 non-null object 6 Card_Category 10127 non-null category 7 Months_on_book 10127 non-null int64 8 Total_Relationship_Count 10127 non-null category 9 Months_Inactive_12_mon 10127 non-null category 10 Contacts_Count_12_mon 10127 non-null category 11 Credit_Limit 10127 non-null float64 12 Total_Revolving_Bal 10127 non-null int64 13 Total_Amt_Chng_Q4_Q1 10127 non-null float64 14 Total_Trans_Ct 10127 non-null int64 15 Total_Ct_Chng_Q4_Q1 10127 non-null float64 dtypes: category(7), float64(3), int64(4), object(2) memory usage: 783.0+ KB
# Locate Customer_Age points lying more than 4*IQR away from the median
quartiles = np.quantile(data['Customer_Age'].dropna(), [0.25, 0.75])
power_4iqr = 4 * (quartiles[1] - quartiles[0])
print(f'Q1 = {quartiles[0]}, Q3 = {quartiles[1]}, 4*IQR = {power_4iqr}')
outlier_powers = data.loc[
    (data['Customer_Age'] - data['Customer_Age'].median()).abs() > power_4iqr,
    'Customer_Age',
]
outlier_powers
Q1 = 41.0, Q3 = 52.0, 4*IQR = 44.0
Series([], Name: Customer_Age, dtype: int64)
##### Months on book
# Locate Months_on_book points more than 4*IQR away from the median
quartiles = np.quantile(data['Months_on_book'][data['Months_on_book'].notnull()], [.25, .75])
power_4iqr = 4 * (quartiles[1] - quartiles[0])
print(f'Q1 = {quartiles[0]}, Q3 = {quartiles[1]}, 4*IQR = {power_4iqr}')
outlier_powers = data.loc[np.abs(data['Months_on_book'] - data['Months_on_book'].median()) > power_4iqr, 'Months_on_book']
outlier_powers
Q1 = 31.0, Q3 = 40.0, 4*IQR = 36.0
Series([], Name: Months_on_book, dtype: int64)
# Locate Credit_Limit points more than 4*IQR away from the median
quartiles = np.quantile(data['Credit_Limit'][data['Credit_Limit'].notnull()], [.25, .75])
power_4iqr = 4 * (quartiles[1] - quartiles[0])
print(f'Q1 = {quartiles[0]}, Q3 = {quartiles[1]}, 4*IQR = {power_4iqr}')
outlier_powers = data.loc[np.abs(data['Credit_Limit'] - data['Credit_Limit'].median()) > power_4iqr, 'Credit_Limit']
outlier_powers
Q1 = 2555.0, Q3 = 11067.5, 4*IQR = 34050.0
Series([], Name: Credit_Limit, dtype: float64)
histogram_boxplot(data, 'Total_Amt_Chng_Q4_Q1')
# Locate Total_Amt_Chng_Q4_Q1 points more than 4*IQR away from the median
quartiles = np.quantile(data['Total_Amt_Chng_Q4_Q1'][data['Total_Amt_Chng_Q4_Q1'].notnull()], [.25, .75])
power_4iqr = 4 * (quartiles[1] - quartiles[0])
print(f'Q1 = {quartiles[0]}, Q3 = {quartiles[1]}, 4*IQR = {power_4iqr}')
outlier_powers = data.loc[np.abs(data['Total_Amt_Chng_Q4_Q1'] - data['Total_Amt_Chng_Q4_Q1'].median()) > power_4iqr, 'Total_Amt_Chng_Q4_Q1']
outlier_powers
Q1 = 0.631, Q3 = 0.859, 4*IQR = 0.9119999999999999
2 2.594 4 2.175 6 1.975 7 2.204 8 3.355 12 3.397 15 1.707 16 1.708 18 1.831 32 1.726 33 1.750 36 1.724 46 2.316 47 2.357 58 2.275 68 1.715 88 1.932 89 1.702 94 1.705 95 1.656 113 1.674 115 1.826 117 1.873 137 1.689 142 1.996 154 2.121 177 2.053 180 1.791 190 1.768 219 2.368 231 1.727 284 2.145 295 1.859 336 1.707 431 2.023 466 2.271 658 2.282 773 2.675 841 2.180 855 1.867 869 1.924 1085 2.041 1176 1.893 1219 2.103 1369 1.749 1455 1.787 1486 1.800 1570 1.676 1689 1.705 1718 1.769 1873 2.037 1883 1.669 2337 1.743 2565 1.731 3270 1.675 Name: Total_Amt_Chng_Q4_Q1, dtype: float64
# Locate Total_Trans_Ct points more than 4*IQR away from the median
quartiles = np.quantile(data['Total_Trans_Ct'][data['Total_Trans_Ct'].notnull()], [.25, .75])
power_4iqr = 4 * (quartiles[1] - quartiles[0])
print(f'Q1 = {quartiles[0]}, Q3 = {quartiles[1]}, 4*IQR = {power_4iqr}')
outlier_powers = data.loc[np.abs(data['Total_Trans_Ct'] - data['Total_Trans_Ct'].median()) > power_4iqr, 'Total_Trans_Ct']
outlier_powers
Q1 = 45.0, Q3 = 81.0, 4*IQR = 144.0
Series([], Name: Total_Trans_Ct, dtype: int64)
# Locate Total_Ct_Chng_Q4_Q1 points more than 4*IQR away from the median
quartiles = np.quantile(data['Total_Ct_Chng_Q4_Q1'][data['Total_Ct_Chng_Q4_Q1'].notnull()], [.25, .75])
power_4iqr = 4 * (quartiles[1] - quartiles[0])
print(f'Q1 = {quartiles[0]}, Q3 = {quartiles[1]}, 4*IQR = {power_4iqr}')
outlier_powers = data.loc[np.abs(data['Total_Ct_Chng_Q4_Q1'] - data['Total_Ct_Chng_Q4_Q1'].median()) > power_4iqr, 'Total_Ct_Chng_Q4_Q1']
outlier_powers
Q1 = 0.582, Q3 = 0.818, 4*IQR = 0.944
1 3.714 2 2.333 3 2.333 4 2.500 12 3.250 13 2.000 15 1.700 30 2.571 32 1.667 47 1.700 52 1.923 68 2.400 69 2.000 76 1.875 84 2.000 88 1.800 91 2.182 113 3.000 131 2.200 146 2.875 151 1.909 158 2.429 162 2.167 167 2.286 190 3.000 231 2.000 239 2.273 269 3.500 280 2.400 281 1.667 294 2.083 300 2.000 309 2.100 322 1.750 323 1.875 324 1.875 346 1.833 366 2.750 418 1.778 456 2.000 697 1.857 757 2.222 760 1.778 773 3.571 805 2.500 1041 1.750 1095 2.222 1187 1.647 1256 2.000 1455 2.000 1778 1.700 2099 1.722 2358 1.882 2510 2.500 2565 1.647 2683 1.750 2696 1.923 4146 1.783 4597 1.650 9977 1.684 Name: Total_Ct_Chng_Q4_Q1, dtype: float64
histogram_boxplot(data, 'Total_Ct_Chng_Q4_Q1')
After the train/test split: replace outliers in Total_Amt_Chng_Q4_Q1 and Total_Ct_Chng_Q4_Q1 — values greater than 1.8 — with the 3rd quartile.
#Explore numerical variable relationships and multi-collinearity
plt.figure(figsize=(15, 7))
# Annotated correlation heatmap of the numeric columns, fixed to the [-1, 1] scale
sns.heatmap(data.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
# Compare Customer_Age distributions across the two attrition classes
sns.boxplot(data = data, x= 'Customer_Age', y='Attrition_Flag')
<AxesSubplot:xlabel='Customer_Age', ylabel='Attrition_Flag'>
sns.boxplot(data = data, x='Months_on_book', y='Attrition_Flag')
<AxesSubplot:xlabel='Months_on_book', ylabel='Attrition_Flag'>
Neither variable appears to greatly inform the model. I am dropping Months_on_book because its distribution shows an overabundance of 36-month values.
# Drop Months_on_book: per the note above the cell, its distribution is dominated by 36-month values
data = data.drop(['Months_on_book'], axis = 1)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null category 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null category 4 Marital_Status 10127 non-null object 5 Income_Category 10127 non-null object 6 Card_Category 10127 non-null category 7 Total_Relationship_Count 10127 non-null category 8 Months_Inactive_12_mon 10127 non-null category 9 Contacts_Count_12_mon 10127 non-null category 10 Credit_Limit 10127 non-null float64 11 Total_Revolving_Bal 10127 non-null int64 12 Total_Amt_Chng_Q4_Q1 10127 non-null float64 13 Total_Trans_Ct 10127 non-null int64 14 Total_Ct_Chng_Q4_Q1 10127 non-null float64 dtypes: category(7), float64(3), int64(3), object(2) memory usage: 703.9+ KB
# Restore Marital_Status and Income_Category to the memory-efficient category dtype
for _col in ('Marital_Status', 'Income_Category'):
    data[_col] = data[_col].astype('category')
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null category 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null category 4 Marital_Status 10127 non-null category 5 Income_Category 10127 non-null category 6 Card_Category 10127 non-null category 7 Total_Relationship_Count 10127 non-null category 8 Months_Inactive_12_mon 10127 non-null category 9 Contacts_Count_12_mon 10127 non-null category 10 Credit_Limit 10127 non-null float64 11 Total_Revolving_Bal 10127 non-null int64 12 Total_Amt_Chng_Q4_Q1 10127 non-null float64 13 Total_Trans_Ct 10127 non-null int64 14 Total_Ct_Chng_Q4_Q1 10127 non-null float64 dtypes: category(9), float64(3), int64(3) memory usage: 565.7 KB
data['Attrition_Flag'].value_counts()
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64
# Encode the target numerically: Existing Customer -> 0, Attrited Customer -> 1
data['Attrition_Flag'] = data['Attrition_Flag'].replace(
    {'Existing Customer': '0', 'Attrited Customer': '1'}
)
data['Attrition_Flag'] = data['Attrition_Flag'].astype('int64')
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null int64 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null category 4 Marital_Status 10127 non-null category 5 Income_Category 10127 non-null category 6 Card_Category 10127 non-null category 7 Total_Relationship_Count 10127 non-null category 8 Months_Inactive_12_mon 10127 non-null category 9 Contacts_Count_12_mon 10127 non-null category 10 Credit_Limit 10127 non-null float64 11 Total_Revolving_Bal 10127 non-null int64 12 Total_Amt_Chng_Q4_Q1 10127 non-null float64 13 Total_Trans_Ct 10127 non-null int64 14 Total_Ct_Chng_Q4_Q1 10127 non-null float64 dtypes: category(8), float64(3), int64(4) memory usage: 634.8 KB
data['Attrition_Flag'].value_counts()
0 8500 1 1627 Name: Attrition_Flag, dtype: int64
data['Attrition_Flag'].value_counts(1)*100
0 83.934038 1 16.065962 Name: Attrition_Flag, dtype: float64
# separating the independent and dependent variables
X = data.drop(["Attrition_Flag"], axis=1)
y = data["Attrition_Flag"]
# Splitting data into training, validation and test sets (code adapted from Great Learning materials)
# First split: 50% to the training set, 50% to a temporary set (stratified to preserve class ratio)
X_train, X_temp, y_train, y_temp = train_test_split(
X, y, test_size=0.5, random_state=0, stratify=y
)
# Second split: the temporary set is divided into validation (60%) and test (40%)
X_val, X_test, y_val, y_test = train_test_split(
X_temp, y_temp, test_size=0.4, random_state=0, stratify=y_temp
)
print(X_train.shape, X_val.shape, X_test.shape)
(5063, 14) (3038, 14) (2026, 14)
#Replace the outliers with Q3 for outliers above the median
# Fix: cap values > 1.8 with the TRAINING set's Q3 for every split.
# The original capped X_val/X_test with their own quantiles, leaking
# validation/test statistics into the preprocessing step; it also used
# chained .mask(..., inplace=True), which pandas may apply to a temporary
# copy (SettingWithCopy) instead of the frame itself.
for _col in ('Total_Ct_Chng_Q4_Q1', 'Total_Amt_Chng_Q4_Q1'):
    _train_q3 = X_train[_col].quantile(.75)  # cap value learned from train only
    X_train[_col] = X_train[_col].mask(X_train[_col] > 1.8, _train_q3)
    X_val[_col] = X_val[_col].mask(X_val[_col] > 1.8, _train_q3)
    X_test[_col] = X_test[_col].mask(X_test[_col] > 1.8, _train_q3)
# Checking class balance for whole data, train set, validation set, and test set
# (same printed report as the original four copy-pasted stanzas)
for _name, _series in (("y", y), ("y_train", y_train), ("y_val", y_val), ("y_test", y_test)):
    print(f"Target value ratio in {_name}")
    print(_series.value_counts(1))
    print("*" * 80)
Target value ratio in y 0 0.83934 1 0.16066 Name: Attrition_Flag, dtype: float64 ******************************************************************************** Target value ratio in y_train 0 0.839423 1 0.160577 Name: Attrition_Flag, dtype: float64 ******************************************************************************** Target value ratio in y_val 0 0.839368 1 0.160632 Name: Attrition_Flag, dtype: float64 ******************************************************************************** Target value ratio in y_test 0 0.839092 1 0.160908 Name: Attrition_Flag, dtype: float64 ********************************************************************************
X_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5063 entries, 2327 to 5525 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_Age 5063 non-null int64 1 Gender 5063 non-null category 2 Dependent_count 5063 non-null category 3 Marital_Status 5063 non-null category 4 Income_Category 5063 non-null category 5 Card_Category 5063 non-null category 6 Total_Relationship_Count 5063 non-null category 7 Months_Inactive_12_mon 5063 non-null category 8 Contacts_Count_12_mon 5063 non-null category 9 Credit_Limit 5063 non-null float64 10 Total_Revolving_Bal 5063 non-null int64 11 Total_Amt_Chng_Q4_Q1 5063 non-null float64 12 Total_Trans_Ct 5063 non-null int64 13 Total_Ct_Chng_Q4_Q1 5063 non-null float64 dtypes: category(8), float64(3), int64(3) memory usage: 318.2 KB
# One-hot encode categorical predictors; drop_first avoids redundant (collinear) dummy columns
X_train = pd.get_dummies(X_train, drop_first=True)
X_train.shape
(5063, 38)
# One-hot encode the test split, then align its columns to the training
# design matrix: get_dummies on each split independently can yield a
# different column set/order if a category level is absent from one split.
X_test = pd.get_dummies(X_test, drop_first=True)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test.shape
(2026, 38)
# One-hot encode the validation split, then align its columns to the training
# design matrix so the model sees the same feature layout on every split.
X_val = pd.get_dummies(X_val, drop_first=True)
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_val.shape
(3038, 38)
X_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5063 entries, 2327 to 5525 Data columns (total 38 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_Age 5063 non-null int64 1 Credit_Limit 5063 non-null float64 2 Total_Revolving_Bal 5063 non-null int64 3 Total_Amt_Chng_Q4_Q1 5063 non-null float64 4 Total_Trans_Ct 5063 non-null int64 5 Total_Ct_Chng_Q4_Q1 5063 non-null float64 6 Gender_M 5063 non-null uint8 7 Dependent_count_1 5063 non-null uint8 8 Dependent_count_2 5063 non-null uint8 9 Dependent_count_3 5063 non-null uint8 10 Dependent_count_4 5063 non-null uint8 11 Dependent_count_5 5063 non-null uint8 12 Marital_Status_Married 5063 non-null uint8 13 Marital_Status_Single 5063 non-null uint8 14 Income_Category_$40K - $60K 5063 non-null uint8 15 Income_Category_$60K - $80K 5063 non-null uint8 16 Income_Category_$80K - $120K 5063 non-null uint8 17 Income_Category_Less than $40K 5063 non-null uint8 18 Card_Category_Gold 5063 non-null uint8 19 Card_Category_Platinum 5063 non-null uint8 20 Card_Category_Silver 5063 non-null uint8 21 Total_Relationship_Count_2 5063 non-null uint8 22 Total_Relationship_Count_3 5063 non-null uint8 23 Total_Relationship_Count_4 5063 non-null uint8 24 Total_Relationship_Count_5 5063 non-null uint8 25 Total_Relationship_Count_6 5063 non-null uint8 26 Months_Inactive_12_mon_1 5063 non-null uint8 27 Months_Inactive_12_mon_2 5063 non-null uint8 28 Months_Inactive_12_mon_3 5063 non-null uint8 29 Months_Inactive_12_mon_4 5063 non-null uint8 30 Months_Inactive_12_mon_5 5063 non-null uint8 31 Months_Inactive_12_mon_6 5063 non-null uint8 32 Contacts_Count_12_mon_1 5063 non-null uint8 33 Contacts_Count_12_mon_2 5063 non-null uint8 34 Contacts_Count_12_mon_3 5063 non-null uint8 35 Contacts_Count_12_mon_4 5063 non-null uint8 36 Contacts_Count_12_mon_5 5063 non-null uint8 37 Contacts_Count_12_mon_6 5063 non-null uint8 dtypes: float64(3), int64(3), uint8(32) memory usage: 435.1 KB
The metric of interest is recall. The primary interest of the business is which customers attrited, and whether there are features that can inform the business when a customer might leave. The models are classification models where attrited customers are the positive outcome. Recall measures, given that a customer churns, how well the model predicted the churn. Based on these positive outcomes, the model can then generate a list of features that were most important to determining that outcome.
## Function to calculate different metric scores of the model - Accuracy, Recall and Precision
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute accuracy, recall, precision and F1 for a fitted classifier.

    model: fitted sklearn-style classifier (must expose .predict)
    predictors: independent variables
    target: true labels

    Returns a one-row DataFrame with columns Accuracy, Recall, Precision, F1.
    """
    pred = model.predict(predictors)
    # Collect each metric under the column name the rest of the notebook expects
    scores = {
        "Accuracy": accuracy_score(target, pred),
        "Recall": recall_score(target, pred),
        "Precision": precision_score(target, pred),
        "F1": f1_score(target, pred),
    }
    return pd.DataFrame(scores, index=[0])
# Model 1: decision tree with class weights offsetting the ~16/84 class imbalance
M1_dtree = DecisionTreeClassifier(criterion='gini',class_weight={0:0.16,1:0.84},random_state=1)
M1_dtree.fit(X_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.16, 1: 0.84}, random_state=1)
# Training-set metrics for the weighted decision tree
M1_dtree_model_train_perf=model_performance_classification_sklearn(M1_dtree, X_train, y_train)
print("Training performance \n",M1_dtree_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 1.0 1.0 1.0 1.0
# Validation-set metrics for the weighted decision tree
M1_dtree_model_val_perf=model_performance_classification_sklearn(M1_dtree, X_val, y_val)
print("Validation performance \n",M1_dtree_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.890718 0.645492 0.664557 0.654886
Decision tree Model is over fitting; recall on validation set is low - 64%
# Fit the model on train
# Model 2: plain logistic regression (liblinear solver)
M2_LogReg = LogisticRegression(solver="liblinear", random_state=1)
M2_LogReg.fit(X_train, y_train)
LogisticRegression(random_state=1, solver='liblinear')
# Training-set metrics for logistic regression
M2_LogReg_model_train_perf=model_performance_classification_sklearn(M2_LogReg, X_train, y_train)
print("Training performance \n",M2_LogReg_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.893541 0.472325 0.777328 0.587605
# Validation-set metrics for logistic regression
M2_LogReg_model_val_perf=model_performance_classification_sklearn(M2_LogReg, X_val, y_val)
print("Validation performance \n",M2_LogReg_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.892693 0.44877 0.793478 0.573298
Model is not overfitting but recall score is low at 45%
from sklearn.ensemble import BaggingClassifier
# Model 3: bagging ensemble with the default base estimator (decision trees)
M3_bagging = BaggingClassifier(random_state=1)
M3_bagging.fit(X_train,y_train)
BaggingClassifier(random_state=1)
# Training-set metrics for the bagging classifier
M3_bagging_model_train_perf=model_performance_classification_sklearn(M3_bagging, X_train, y_train)
print("Training performance \n",M3_bagging_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.990914 0.9508 0.992298 0.971106
# Validation-set metrics for the bagging classifier
M3_bagging_model_val_perf=model_performance_classification_sklearn(M3_bagging, X_val, y_val)
print("Validation performance \n",M3_bagging_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.914417 0.606557 0.813187 0.694836
Model is over fitting and recall score is low - 61%
# Model 4: bagging ensemble with logistic regression as the base estimator
M4_bagging_lr=BaggingClassifier(base_estimator=LogisticRegression(solver='liblinear',random_state=1,max_iter=1000),random_state=1)
M4_bagging_lr.fit(X_train,y_train)
BaggingClassifier(base_estimator=LogisticRegression(max_iter=1000,
random_state=1,
solver='liblinear'),
random_state=1)
# Training-set metrics for bagging with LR base estimator
M4_bagging_lr_model_train_perf=model_performance_classification_sklearn(M4_bagging_lr, X_train, y_train)
print("Training performance \n",M4_bagging_lr_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.892356 0.444034 0.795154 0.56985
# Validation-set metrics for bagging with LR base estimator
M4_bagging_lr_model_val_perf=model_performance_classification_sklearn(M4_bagging_lr, X_val, y_val)
print("Validation performance \n",M4_bagging_lr_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.890388 0.413934 0.811245 0.548168
Model is not overfitting however recall score is poor - 42%
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
# Model 5: AdaBoost with default settings
M5_abc = AdaBoostClassifier(random_state=1)
M5_abc.fit(X_train,y_train)
AdaBoostClassifier(random_state=1)
# Training-set metrics for AdaBoost
M5_abc_model_train_perf=model_performance_classification_sklearn(M5_abc, X_train, y_train)
print("Training performance \n",M5_abc_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.928106 0.710947 0.817539 0.760526
# Validation-set metrics for AdaBoost
M5_abc_model_val_perf=model_performance_classification_sklearn(M5_abc, X_val, y_val)
print("Validation performance \n",M5_abc_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.923305 0.682377 0.810219 0.740823
Model is not overfitting however recall score is improved from previous models - 68%
# Model 6: gradient boosting with default settings
M6_gbc = GradientBoostingClassifier(random_state=1)
M6_gbc.fit(X_train,y_train)
GradientBoostingClassifier(random_state=1)
# Training-set metrics for gradient boosting
M6_gbc_model_train_perf=model_performance_classification_sklearn(M6_gbc, X_train, y_train)
print("Training performance \n",M6_gbc_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.941932 0.720787 0.897397 0.799454
# Validation-set metrics for gradient boosting
M6_gbc_model_val_perf=model_performance_classification_sklearn(M6_gbc, X_val, y_val)
print("Validation performance \n",M6_gbc_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.926926 0.647541 0.863388 0.740047
Model is slightly overfitting and performing better than some previous models - recall is 65%
# training performance comparison
# Transpose each one-row metrics frame so models become columns and metrics become rows
models_train_comp_df = pd.concat(
[M1_dtree_model_train_perf.T,M2_LogReg_model_train_perf.T, M3_bagging_model_train_perf.T,M4_bagging_lr_model_train_perf.T,
M5_abc_model_train_perf.T,M6_gbc_model_train_perf.T],
axis=1,
)
models_train_comp_df.columns = [
"Decision Tree", 'Logistic Regression',
"Bagging Classifier",
"Bagging with LR est",
"AdaBoost Classifier",
"Gradient Booster"]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| Decision Tree | Logistic Regression | Bagging Classifier | Bagging with LR est | AdaBoost Classifier | Gradient Booster | |
|---|---|---|---|---|---|---|
| Accuracy | 1.0 | 0.893541 | 0.990914 | 0.892356 | 0.928106 | 0.941932 |
| Recall | 1.0 | 0.472325 | 0.950800 | 0.444034 | 0.710947 | 0.720787 |
| Precision | 1.0 | 0.777328 | 0.992298 | 0.795154 | 0.817539 | 0.897397 |
| F1 | 1.0 | 0.587605 | 0.971106 | 0.569850 | 0.760526 | 0.799454 |
# validation performance comparison
# Transpose each one-row metrics frame so models become columns and metrics become rows
models_val_comp_df = pd.concat(
[M1_dtree_model_val_perf.T,M2_LogReg_model_val_perf.T, M3_bagging_model_val_perf.T,M4_bagging_lr_model_val_perf.T,
M5_abc_model_val_perf.T,M6_gbc_model_val_perf.T],
axis=1,
)
models_val_comp_df.columns = [
"Decision Tree", 'Logistic Regression',
"Bagging Classifier",
"Bagging with LR est",
"AdaBoost Classifier",
"Gradient Booster"]
print("Validation performance comparison:")
models_val_comp_df
Validation performance comparison:
| Decision Tree | Logistic Regression | Bagging Classifier | Bagging with LR est | AdaBoost Classifier | Gradient Booster | |
|---|---|---|---|---|---|---|
| Accuracy | 0.890718 | 0.892693 | 0.914417 | 0.890388 | 0.923305 | 0.926926 |
| Recall | 0.645492 | 0.448770 | 0.606557 | 0.413934 | 0.682377 | 0.647541 |
| Precision | 0.664557 | 0.793478 | 0.813187 | 0.811245 | 0.810219 | 0.863388 |
| F1 | 0.654886 | 0.573298 | 0.694836 | 0.548168 | 0.740823 | 0.740047 |
Of the first six models, the best performing models in terms of recall score and NOT overfitting are AdaBoost Classifier and Gradient Booster
# # Fit SMOTE on train data(Synthetic Minority Oversampling Technique)
sm = SMOTE(sampling_strategy=0.4, k_neighbors=5, random_state=1)
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
# Report class counts before/after oversampling (f-strings print the same text as the original .format calls)
print(f"Before OverSampling, count of label '1': {sum(y_train == 1)}")
print(f"Before OverSampling, count of label '0': {sum(y_train == 0)} \n")
print(f"After OverSampling, count of label '1': {sum(y_train_over == 1)}")
print(f"After OverSampling, count of label '0': {sum(y_train_over == 0)} \n")
print(f"After OverSampling, the shape of train_X: {X_train_over.shape}")
print(f"After OverSampling, the shape of train_y: {y_train_over.shape} \n")
Before OverSampling, count of label '1': 813 Before OverSampling, count of label '0': 4250 After OverSampling, count of label '1': 1700 After OverSampling, count of label '0': 4250 After OverSampling, the shape of train_X: (5950, 38) After OverSampling, the shape of train_y: (5950,)
# Model 7: decision tree trained on the SMOTE-oversampled data
M7_dtree_over = DecisionTreeClassifier(criterion='gini',random_state=1)
M7_dtree_over.fit(X_train_over, y_train_over)
DecisionTreeClassifier(random_state=1)
# Training-set (oversampled) metrics for the decision tree
M7_dtree_over_model_train_perf=model_performance_classification_sklearn(M7_dtree_over, X_train_over, y_train_over)
print("Training performance \n",M7_dtree_over_model_train_perf)  # fixed garbled label "\nover"
Training performance over Accuracy Recall Precision F1 0 1.0 1.0 1.0 1.0
# Validation-set metrics for the oversampled decision tree
M7_dtree_over_model_val_perf=model_performance_classification_sklearn(M7_dtree_over, X_val, y_val)
print("Validation performance \n",M7_dtree_over_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.872943 0.651639 0.595506 0.622309
Model is overfitting; recall performance - 65%
# Model 8: logistic regression trained on the SMOTE-oversampled data
M8_LogReg_over = LogisticRegression(solver="liblinear", random_state=1)
M8_LogReg_over.fit(X_train_over, y_train_over)
LogisticRegression(random_state=1, solver='liblinear')
# Training-set (oversampled) metrics for logistic regression
M8_LogReg_over_model_train_perf=model_performance_classification_sklearn(M8_LogReg_over, X_train_over, y_train_over)
print("Training performance \n",M8_LogReg_over_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.898151 0.737059 0.887394 0.80527
# Validation-set metrics for the oversampled logistic regression
M8_LogReg_over_model_val_perf=model_performance_classification_sklearn(M8_LogReg_over, X_val, y_val)
print("Validation performance \n",M8_LogReg_over_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.889072 0.502049 0.722714 0.592503
Model is overfitting; recall is poor: 50%
# Model 9: bagging classifier trained on the SMOTE-oversampled data
M9_bagging_over = BaggingClassifier(random_state=1)
M9_bagging_over.fit(X_train_over,y_train_over)
BaggingClassifier(random_state=1)
# Training-set (oversampled) metrics for the bagging classifier
M9_bagging_over_model_train_perf=model_performance_classification_sklearn(M9_bagging_over, X_train_over, y_train_over)
print("Training performance \n",M9_bagging_over_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.995462 0.987647 0.996439 0.992024
# Validation-set metrics for the oversampled bagging classifier
M9_bagging_over_model_val_perf=model_performance_classification_sklearn(M9_bagging_over, X_val, y_val)
print("Validation performance \n",M9_bagging_over_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.912113 0.631148 0.779747 0.697622
Model is overfitting: Recall score is poor - 63%
# Model 10: bagging with LR base estimator, trained on the SMOTE-oversampled data
M10_bagging_lr_over=BaggingClassifier(base_estimator=LogisticRegression(solver='liblinear',random_state=1,max_iter=1000),random_state=1)
M10_bagging_lr_over.fit(X_train_over,y_train_over)
BaggingClassifier(base_estimator=LogisticRegression(max_iter=1000,
random_state=1,
solver='liblinear'),
random_state=1)
# Training-set (oversampled) metrics for bagging with LR base estimator
M10_bagging_lr_over_model_train_perf=model_performance_classification_sklearn(M10_bagging_lr_over, X_train_over, y_train_over)
print("Training performance \n",M10_bagging_lr_over_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.899328 0.730588 0.898048 0.805709
# Validation-set metrics for the oversampled bagging-with-LR model
M10_bagging_lr_over_model_val_perf=model_performance_classification_sklearn(M10_bagging_lr_over, X_val, y_val)
print("Validation performance \n",M10_bagging_lr_over_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.890388 0.479508 0.747604 0.58427
Model is overfitting and Recall score is low - 48%
# Model 11: AdaBoost trained on the SMOTE-oversampled data
M11_abc_over = AdaBoostClassifier(random_state=1)
M11_abc_over.fit(X_train_over,y_train_over)
AdaBoostClassifier(random_state=1)
# Training-set (oversampled) metrics for AdaBoost
M11_abc_over_model_train_perf=model_performance_classification_sklearn(M11_abc_over, X_train_over, y_train_over)
print("Training performance \n",M11_abc_over_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.931597 0.859412 0.896869 0.877741
# Validation-set metrics for the oversampled AdaBoost model
M11_abc_over_model_val_perf=model_performance_classification_sklearn(M11_abc_over, X_val, y_val)
print("Validation performance \n",M11_abc_over_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.923634 0.706967 0.794931 0.748373
Model is slight overfitting; recall score is improved to 71%
# Model 12: gradient boosting trained on the SMOTE-oversampled data
M12_gbc_over = GradientBoostingClassifier(random_state=1)
M12_gbc_over.fit(X_train_over,y_train_over)
GradientBoostingClassifier(random_state=1)
# Training-set (oversampled) metrics for gradient boosting
M12_gbc_over_model_train_perf=model_performance_classification_sklearn(M12_gbc_over, X_train_over, y_train_over)
print("Training performance \n",M12_gbc_over_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.95042 0.877647 0.944902 0.910034
# Validation-set metrics for the oversampled gradient boosting model
M12_gbc_over_model_val_perf=model_performance_classification_sklearn(M12_gbc_over, X_val, y_val)
print("Validation performance \n",M12_gbc_over_model_val_perf)
Validation performance
Accuracy Recall Precision F1
0 0.932192 0.709016 0.843902 0.770601
Model is slightly over fitting; recall is fair at 71%
# training performance comparison
# Transpose each one-row metrics frame so models become columns and metrics become rows
models_over_train_comp_df = pd.concat(
[M7_dtree_over_model_train_perf.T,M8_LogReg_over_model_train_perf.T, M9_bagging_over_model_train_perf.T,M10_bagging_lr_over_model_train_perf.T,
M11_abc_over_model_train_perf.T,M12_gbc_over_model_train_perf.T],
axis=1,
)
models_over_train_comp_df.columns = [
"Decision Tree over", 'Logistic Regression over',
"Bagging Classifier over",
"Bagging with LR est over",
"AdaBoost Classifier over",
"Gradient Booster over"]
print("Oversampling:Training performance comparison:")
models_over_train_comp_df
Oversampling:Training performance comparison:
| Decision Tree over | Logistic Regression over | Bagging Classifier over | Bagging with LR est over | AdaBoost Classifier over | Gradient Booster over | |
|---|---|---|---|---|---|---|
| Accuracy | 1.0 | 0.898151 | 0.995462 | 0.899328 | 0.931597 | 0.950420 |
| Recall | 1.0 | 0.737059 | 0.987647 | 0.730588 | 0.859412 | 0.877647 |
| Precision | 1.0 | 0.887394 | 0.996439 | 0.898048 | 0.896869 | 0.944902 |
| F1 | 1.0 | 0.805270 | 0.992024 | 0.805709 | 0.877741 | 0.910034 |
# validation performance comparison
# Transpose each one-row metrics frame so models become columns and metrics become rows
models_over_val_comp_df = pd.concat(
[M7_dtree_over_model_val_perf.T,M8_LogReg_over_model_val_perf.T, M9_bagging_over_model_val_perf.T,M10_bagging_lr_over_model_val_perf.T,
M11_abc_over_model_val_perf.T,M12_gbc_over_model_val_perf.T],
axis=1,
)
models_over_val_comp_df.columns = [
"Decision Tree over", 'Logistic Regression over',
"Bagging Classifier over",
"Bagging with LR est over",
"AdaBoost Classifier over",
"Gradient Booster over"]
print("Oversampling: Validation performance comparison:")
models_over_val_comp_df
Oversampling: Validation performance comparison:
| Decision Tree over | Logistic Regression over | Bagging Classifier over | Bagging with LR est over | AdaBoost Classifier over | Gradient Booster over | |
|---|---|---|---|---|---|---|
| Accuracy | 0.872943 | 0.889072 | 0.912113 | 0.890388 | 0.923634 | 0.932192 |
| Recall | 0.651639 | 0.502049 | 0.631148 | 0.479508 | 0.706967 | 0.709016 |
| Precision | 0.595506 | 0.722714 | 0.779747 | 0.747604 | 0.794931 | 0.843902 |
| F1 | 0.622309 | 0.592503 | 0.697622 | 0.584270 | 0.748373 | 0.770601 |
# fit random under sampler on the train data
rus = RandomUnderSampler(random_state=1, sampling_strategy = 1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
# Report class counts before/after undersampling (f-strings print the same text as the original .format calls)
print(f"Before Under Sampling, count of label '1': {sum(y_train == 1)}")
print(f"Before Under Sampling, count of label '0': {sum(y_train == 0)} \n")
print(f"After Under Sampling, count of label '1': {sum(y_train_un == 1)}")
print(f"After Under Sampling, count of label '0': {sum(y_train_un == 0)} \n")
print(f"After Under Sampling, the shape of train_X: {X_train_un.shape}")
print(f"After Under Sampling, the shape of train_y: {y_train_un.shape} \n")
Before Under Sampling, count of label '1': 813 Before Under Sampling, count of label '0': 4250 After Under Sampling, count of label '1': 813 After Under Sampling, count of label '0': 813 After Under Sampling, the shape of train_X: (1626, 38) After Under Sampling, the shape of train_y: (1626,)
# Model 13: decision tree trained on the undersampled data
M13_dtree_under = DecisionTreeClassifier(criterion='gini',random_state=1)
M13_dtree_under.fit(X_train_un, y_train_un)
DecisionTreeClassifier(random_state=1)
# Training-set (undersampled) metrics for the decision tree
M13_dtree_under_model_train_perf=model_performance_classification_sklearn(M13_dtree_under, X_train_un, y_train_un)
print("Training performance \n",M13_dtree_under_model_train_perf)  # fixed garbled label "\nover"
Training performance over Accuracy Recall Precision F1 0 1.0 1.0 1.0 1.0
# Validation-set metrics for the undersampled decision tree
M13_dtree_under_model_val_perf=model_performance_classification_sklearn(M13_dtree_under, X_val, y_val)
print("Validation performance \n",M13_dtree_under_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.820935 0.805328 0.466746 0.590977
Model is overfitting; recall is improved from previous decision tree models - 81%
# Model 14: logistic regression trained on the undersampled data
M14_LogReg_under = LogisticRegression(solver="liblinear", random_state=1)
M14_LogReg_under.fit(X_train_un, y_train_un)
LogisticRegression(random_state=1, solver='liblinear')
# Training-set (undersampled) metrics for logistic regression
M14_LogReg_under_model_train_perf=model_performance_classification_sklearn(M14_LogReg_under, X_train_un, y_train_un)
print("Training performance \n",M14_LogReg_under_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.851169 0.848708 0.852905 0.850801
# Validation-set metrics for the undersampled logistic regression
M14_LogReg_under_model_val_perf=model_performance_classification_sklearn(M14_LogReg_under, X_val, y_val)
print("Validation performance \n",M14_LogReg_under_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.838381 0.805328 0.498099 0.615505
Model is very slightly overfitting; recall is much improved from previous models - 81%
# Model 15: bagging classifier trained on the undersampled data
M15_bagging_under = BaggingClassifier(random_state=1)
M15_bagging_under.fit(X_train_un,y_train_un)
BaggingClassifier(random_state=1)
# Training-set (undersampled) metrics for the bagging classifier
M15_bagging_under_model_train_perf=model_performance_classification_sklearn(M15_bagging_under, X_train_un, y_train_un)
print("Training performance \n",M15_bagging_under_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.989545 0.98032 0.998747 0.989448
# Validation-set metrics for the undersampled bagging classifier
M15_bagging_under_model_val_perf=model_performance_classification_sklearn(M15_bagging_under, X_val, y_val)
print("Validation performance \n",M15_bagging_under_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.882818 0.821721 0.598507 0.692573
Model is overfitting; recall score is fair - 82%
# Model 16: bagging with LR base estimator, trained on the undersampled data
M16_bagging_lr_under=BaggingClassifier(base_estimator=LogisticRegression(solver='liblinear',random_state=1,max_iter=1000),random_state=1)
M16_bagging_lr_under.fit(X_train_un,y_train_un)
BaggingClassifier(base_estimator=LogisticRegression(max_iter=1000,
random_state=1,
solver='liblinear'),
random_state=1)
# Training-set (undersampled) metrics for bagging with LR base estimator
M16_bagging_lr_under_model_train_perf=model_performance_classification_sklearn(M16_bagging_lr_under, X_train_un, y_train_un)
print("Training performance \n",M16_bagging_lr_under_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.844403 0.832718 0.852645 0.842564
# Validation-set metrics for the undersampled bagging-with-LR model
M16_bagging_lr_under_model_val_perf=model_performance_classification_sklearn(M16_bagging_lr_under, X_val, y_val)
print("Validation performance \n",M16_bagging_lr_under_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.84233 0.809426 0.505762 0.622537
Model is not overfitting; Recall score is fair - 81%
# Model 17: AdaBoost trained on the undersampled data
M17_abc_under = AdaBoostClassifier(random_state=1)
M17_abc_under.fit(X_train_un,y_train_un)
AdaBoostClassifier(random_state=1)
# Training-set (undersampled) metrics for AdaBoost
M17_abc_under_model_train_perf=model_performance_classification_sklearn(M17_abc_under, X_train_un, y_train_un)
print("Training performance \n",M17_abc_under_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.878844 0.879459 0.878378 0.878918
# Validation-set metrics for the undersampled AdaBoost model
M17_abc_under_model_val_perf=model_performance_classification_sklearn(M17_abc_under, X_val, y_val)
print("Validation performance \n",M17_abc_under_model_val_perf)  # fixed misspelled "Validatiion"
Validatiion performance
Accuracy Recall Precision F1
0 0.882488 0.885246 0.589359 0.707617
Model is not overfitting; Recall score is good - 89%
# Model 18: gradient boosting trained on the undersampled data
M18_gbc_under = GradientBoostingClassifier(random_state=1)
M18_gbc_under.fit(X_train_un,y_train_un)
GradientBoostingClassifier(random_state=1)
# Training-set (undersampled) metrics for gradient boosting
M18_gbc_under_model_train_perf=model_performance_classification_sklearn(M18_gbc_under, X_train_un, y_train_un)
print("Training performance \n",M18_gbc_under_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.929889 0.936039 0.924666 0.930318
# Validation-set metrics for the undersampled gradient boosting model
M18_gbc_under_model_val_perf=model_performance_classification_sklearn(M18_gbc_under, X_val, y_val)
print("Validation performance \n",M18_gbc_under_model_val_perf)
Validation performance
Accuracy Recall Precision F1
0 0.891376 0.872951 0.613833 0.720812
Model is slightly overfitting; recall score is good - 87%
# training performance comparison
# Transpose each one-row metrics frame so models become columns and metrics become rows
models_under_train_comp_df = pd.concat(
[M13_dtree_under_model_train_perf.T,M14_LogReg_under_model_train_perf.T, M15_bagging_under_model_train_perf.T,M16_bagging_lr_under_model_train_perf.T,
M17_abc_under_model_train_perf.T,M18_gbc_under_model_train_perf.T],
axis=1,
)
models_under_train_comp_df.columns = [
"Decision Tree under", 'Logistic Regression under',
"Bagging Classifier under",
"Bagging with LR est under",
"AdaBoost Classifier under",
"Gradient Booster under"]
print("Undersampling:Training performance comparison:")
models_under_train_comp_df
Undersampling:Training performance comparison:
| Decision Tree under | Logistic Regression under | Bagging Classifier under | Bagging with LR est under | AdaBoost Classifier under | Gradient Booster under | |
|---|---|---|---|---|---|---|
| Accuracy | 1.0 | 0.851169 | 0.989545 | 0.844403 | 0.878844 | 0.929889 |
| Recall | 1.0 | 0.848708 | 0.980320 | 0.832718 | 0.879459 | 0.936039 |
| Precision | 1.0 | 0.852905 | 0.998747 | 0.852645 | 0.878378 | 0.924666 |
| F1 | 1.0 | 0.850801 | 0.989448 | 0.842564 | 0.878918 | 0.930318 |
# validation performance comparison
# Transpose each one-row metrics frame so models become columns and metrics become rows
models_under_val_comp_df = pd.concat(
[M13_dtree_under_model_val_perf.T,M14_LogReg_under_model_val_perf.T, M15_bagging_under_model_val_perf.T,M16_bagging_lr_under_model_val_perf.T,
M17_abc_under_model_val_perf.T,M18_gbc_under_model_val_perf.T],
axis=1,
)
models_under_val_comp_df.columns = [
"Decision Tree under", 'Logistic Regression under',
"Bagging Classifier under",
"Bagging with LR est under",
"AdaBoost Classifier under",
"Gradient Booster under"]
# Fixed label: this table compares the UNDERsampled models (was "Oversampling:")
print("Undersampling: Validation performance comparison:")
models_under_val_comp_df
Undersampling: Validation performance comparison:
| Decision Tree under | Logistic Regression under | Bagging Classifier under | Bagging with LR est under | AdaBoost Classifier under | Gradient Booster under | |
|---|---|---|---|---|---|---|
| Accuracy | 0.820935 | 0.838381 | 0.882818 | 0.842330 | 0.882488 | 0.891376 |
| Recall | 0.805328 | 0.805328 | 0.821721 | 0.809426 | 0.885246 | 0.872951 |
| Precision | 0.466746 | 0.498099 | 0.598507 | 0.505762 | 0.589359 | 0.613833 |
| F1 | 0.590977 | 0.615505 | 0.692573 | 0.622537 | 0.707617 | 0.720812 |
# Recap: baseline models trained on the original (imbalanced) training data.
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| Decision Tree | Logistic Regression | Bagging Classifier | Bagging with LR est | AdaBoost Classifier | Gradient Booster | |
|---|---|---|---|---|---|---|
| Accuracy | 1.0 | 0.893541 | 0.990914 | 0.892356 | 0.928106 | 0.941932 |
| Recall | 1.0 | 0.472325 | 0.950800 | 0.444034 | 0.710947 | 0.720787 |
| Precision | 1.0 | 0.777328 | 0.992298 | 0.795154 | 0.817539 | 0.897397 |
| F1 | 1.0 | 0.587605 | 0.971106 | 0.569850 | 0.760526 | 0.799454 |
# Recap: baseline models evaluated on the validation set.
print("Validation performance comparison:")
models_val_comp_df
Validation performance comparison:
| Decision Tree | Logistic Regression | Bagging Classifier | Bagging with LR est | AdaBoost Classifier | Gradient Booster | |
|---|---|---|---|---|---|---|
| Accuracy | 0.890718 | 0.892693 | 0.914417 | 0.890388 | 0.923305 | 0.926926 |
| Recall | 0.645492 | 0.448770 | 0.606557 | 0.413934 | 0.682377 | 0.647541 |
| Precision | 0.664557 | 0.793478 | 0.813187 | 0.811245 | 0.810219 | 0.863388 |
| F1 | 0.654886 | 0.573298 | 0.694836 | 0.548168 | 0.740823 | 0.740047 |
# Recap: SMOTE-oversampled models, training metrics.
print("Oversampling:Training performance comparison:")
models_over_train_comp_df
Oversampling:Training performance comparison:
| Decision Tree over | Logistic Regression over | Bagging Classifier over | Bagging with LR est over | AdaBoost Classifier over | Gradient Booster over | |
|---|---|---|---|---|---|---|
| Accuracy | 1.0 | 0.898151 | 0.995462 | 0.899328 | 0.931597 | 0.950420 |
| Recall | 1.0 | 0.737059 | 0.987647 | 0.730588 | 0.859412 | 0.877647 |
| Precision | 1.0 | 0.887394 | 0.996439 | 0.898048 | 0.896869 | 0.944902 |
| F1 | 1.0 | 0.805270 | 0.992024 | 0.805709 | 0.877741 | 0.910034 |
# Recap: SMOTE-oversampled models, validation metrics.
print("Oversampling: Validation performance comparison:")
models_over_val_comp_df
Oversampling: Validation performance comparison:
| Decision Tree over | Logistic Regression over | Bagging Classifier over | Bagging with LR est over | AdaBoost Classifier over | Gradient Booster over | |
|---|---|---|---|---|---|---|
| Accuracy | 0.872943 | 0.889072 | 0.912113 | 0.890388 | 0.923634 | 0.932192 |
| Recall | 0.651639 | 0.502049 | 0.631148 | 0.479508 | 0.706967 | 0.709016 |
| Precision | 0.595506 | 0.722714 | 0.779747 | 0.747604 | 0.794931 | 0.843902 |
| F1 | 0.622309 | 0.592503 | 0.697622 | 0.584270 | 0.748373 | 0.770601 |
# Recap: undersampled models, training metrics.
print("Undersampling:Training performance comparison:")
models_under_train_comp_df
Undersampling:Training performance comparison:
| Decision Tree under | Logistic Regression under | Bagging Classifier under | Bagging with LR est under | AdaBoost Classifier under | Gradient Booster under | |
|---|---|---|---|---|---|---|
| Accuracy | 1.0 | 0.851169 | 0.989545 | 0.844403 | 0.878844 | 0.929889 |
| Recall | 1.0 | 0.848708 | 0.980320 | 0.832718 | 0.879459 | 0.936039 |
| Precision | 1.0 | 0.852905 | 0.998747 | 0.852645 | 0.878378 | 0.924666 |
| F1 | 1.0 | 0.850801 | 0.989448 | 0.842564 | 0.878918 | 0.930318 |
# Recap: undersampled models, validation metrics.
print("Undersampling: Validation performance comparison:")
models_under_val_comp_df
Undersampling: Validation performance comparison:
| Decision Tree under | Logistic Regression under | Bagging Classifier under | Bagging with LR est under | AdaBoost Classifier under | Gradient Booster under | |
|---|---|---|---|---|---|---|
| Accuracy | 0.820935 | 0.838381 | 0.882818 | 0.842330 | 0.882488 | 0.891376 |
| Recall | 0.805328 | 0.805328 | 0.821721 | 0.809426 | 0.885246 | 0.872951 |
| Precision | 0.466746 | 0.498099 | 0.598507 | 0.505762 | 0.589359 | 0.613833 |
| F1 | 0.590977 | 0.615505 | 0.692573 | 0.622537 | 0.707617 | 0.720812 |
1) Logistic Regression Undersampling: Training Recall 85%, Validation Recall 81% - Good performance and model is not overfitting 2) Adaboost Classifier Undersampling: Training Recall 88% Validation Recall 89% - Good performance and not overfitting 3) Gradient Booster Undersampling: Training Recall 94% Validation Recall 87% - Good performance and slight overfitting
from sklearn.model_selection import RandomizedSearchCV
### Hyper Parameter Tuning - Logistic Regression (https://www.youtube.com/watch?v=pooXM9mM7FU)
# Search space: regularization penalty/strength, solver, and iteration budget.
# NOTE(review): not every penalty/solver pair is valid (e.g. 'lbfgs' has no 'l1');
# invalid draws fail silently under the warnings filter — presumably acceptable here.
param_grid_0 = [{'penalty':['l1', 'l2','elasticnet','none'],'C':np.logspace(-4,4,20),'solver':['lbfgs','newton-cg','liblinear','sag','saga'],'max_iter':[100,1000,2500,5000]}]
# Optimize for recall: catching attriting customers is the business priority.
scorer = metrics.make_scorer(metrics.recall_score)
# 50 random draws, 5-fold CV, all cores; random_state makes the draw reproducible.
randomized_cv_lr = RandomizedSearchCV(estimator=M14_LogReg_under, param_distributions=param_grid_0, n_jobs = -1, n_iter=50, scoring=scorer, cv=5, random_state=1)
randomized_cv_lr.fit(X_train_un, y_train_un)
print(
"Best Parameters:{} \nScore: {}".format(randomized_cv_lr.best_params_, randomized_cv_lr.best_score_)
)
Best Parameters:{'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 100, 'C': 545.5594781168514}
Score: 0.8462243429523593
# Set the clf to the best combination of parameters
lr_tuned = randomized_cv_lr.best_estimator_
# Fit the best algorithm to the data.
# NOTE(review): best_estimator_ is already refit on the full training data by
# RandomizedSearchCV (refit=True default); this explicit fit is redundant but harmless.
lr_tuned.fit(X_train_un, y_train_un)
LogisticRegression(C=545.5594781168514, penalty='l1', random_state=1,
solver='liblinear')
# Training metrics for the tuned logistic regression (undersampled data).
lr_tuned_under_model_train_perf=model_performance_classification_sklearn(lr_tuned, X_train_un, y_train_un)
print("Training performance \n",lr_tuned_under_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.859164 0.857319 0.860494 0.858903
# Validation metrics for the tuned logistic regression.
lr_tuned_under_model_val_perf=model_performance_classification_sklearn(lr_tuned, X_val, y_val)
print("Validation performance \n",lr_tuned_under_model_val_perf)
Validation performance
Accuracy Recall Precision F1
0 0.847926 0.795082 0.517333 0.626817
# Parameter grid to pass in Random Search
# AdaBoost search space: ensemble size, shrinkage, and weak-learner depth (stumps to depth-3 trees).
param_grid = {
"n_estimators": np.arange(10, 110, 10),
"learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
"base_estimator": [
DecisionTreeClassifier(max_depth=1, random_state=1),
DecisionTreeClassifier(max_depth=2, random_state=1),
DecisionTreeClassifier(max_depth=3, random_state=1),
],
}
# Type of scoring used to compare parameter combinations
# Recall again, to prioritize catching churners.
scorer = metrics.make_scorer(metrics.recall_score)
# 50 random draws, 5-fold CV, all cores, reproducible draw.
randomized_cv = RandomizedSearchCV(estimator=M17_abc_under, param_distributions=param_grid, n_jobs = -1, n_iter=50, scoring=scorer, cv=5, random_state=1)
# Fitting parameter
randomized_cv.fit(X_train_un, y_train_un)
print(
"Best Parameters:{} \nScore: {}".format(randomized_cv.best_params_, randomized_cv.best_score_)
)
Best Parameters:{'n_estimators': 60, 'learning_rate': 0.05, 'base_estimator': DecisionTreeClassifier(max_depth=3, random_state=1)}
Score: 0.8879951526168295
# building model with best parameters
# Rebuilt explicitly (rather than taking best_estimator_) with the winning
# combination reported by the search above.
adb_tuned = AdaBoostClassifier(
n_estimators=60,
learning_rate=.05,
random_state=1,
base_estimator=DecisionTreeClassifier(max_depth=3, random_state=1),
)
# Fit the model on training data
adb_tuned.fit(X_train_un, y_train_un)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3,
random_state=1),
learning_rate=0.05, n_estimators=60, random_state=1)
# Training metrics for the tuned AdaBoost model (undersampled data).
adb_tuned_under_model_train_perf=model_performance_classification_sklearn(adb_tuned, X_train_un, y_train_un)
print("Training performance \n",adb_tuned_under_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.907134 0.924969 0.893112 0.908761
# Validation metrics for the tuned AdaBoost model.
adb_tuned_under_model_val_perf=model_performance_classification_sklearn(adb_tuned, X_val, y_val)
print("Validation performance \n",adb_tuned_under_model_val_perf)
Validation performance
Accuracy Recall Precision F1
0 0.876893 0.881148 0.576408 0.696921
# Grid of parameters to choose from
## add from article
# Gradient Boosting search space: ensemble size, row subsampling, column subsampling.
param_grid_2 = {
    "n_estimators": [100, 150, 200, 250],
    "subsample": [0.8, 0.9, 1],
    "max_features": [0.7, 0.8, 0.9, 1],
}
# Optimize for recall, consistent with the other searches.
scorer = metrics.make_scorer(metrics.recall_score)
# Run the randomized search.
# FIX: add random_state so the sampled candidates are reproducible, and
# n_jobs=-1 to parallelize folds — consistent with the LR/AdaBoost searches above.
randomized_cv_gb = RandomizedSearchCV(
    M18_gbc_under, param_grid_2, scoring=scorer, cv=5, n_jobs=-1, random_state=1
)
# Fitting parameters in RandomizedSearchCV
randomized_cv_gb.fit(X_train_un, y_train_un)
print(
    "Best Parameters:{} \nScore: {}".format(
        randomized_cv_gb.best_params_, randomized_cv_gb.best_score_
    )
)
Best Parameters:{'subsample': 0.8, 'n_estimators': 150, 'max_features': 0.7}
Score: 0.8880254487616451
# Set the clf to the best combination of parameters
gbc_tuned = randomized_cv_gb.best_estimator_
# Fit the best algorithm to the data.
# NOTE(review): redundant — best_estimator_ is already refit by the search; harmless.
gbc_tuned.fit(X_train_un, y_train_un)
GradientBoostingClassifier(max_features=0.7, n_estimators=150, random_state=1,
subsample=0.8)
# Training metrics for the tuned Gradient Boosting model (undersampled data).
gbc_tuned_under_model_train_perf=model_performance_classification_sklearn(gbc_tuned, X_train_un, y_train_un)
print("Training performance \n",gbc_tuned_under_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.955105 0.95818 0.952323 0.955242
# Validation metrics for the tuned Gradient Boosting model.
gbc_tuned_under_model_val_perf=model_performance_classification_sklearn(gbc_tuned, X_val, y_val)
print("Validation performance \n",gbc_tuned_under_model_val_perf)
Validation performance
Accuracy Recall Precision F1
0 0.90553 0.893443 0.649776 0.752373
1) Tuned Logistic Regression Undersampling: Training Recall 86%, Validation Recall 80% - Good performance and model is not overfitting 2) Tuned Adaboost Classifier Undersampling: Training Recall 93% Validation Recall 89% - Good performance and not overfitting 3) Tuned Gradient Booster Undersampling: Training Recall 96% Validation Recall 89% - Good performance and slight overfitting
The model I chose is the Tuned AdaBoost Classifier Undersampling model, with training recall 93% and validation recall 89%; the model is not overfitting and the recall score indicates good performance.
# Final, one-time evaluation of the chosen model (tuned AdaBoost) on the test set.
adb_tuned_model_test_perf=model_performance_classification_sklearn(adb_tuned, X_test, y_test)
print("Testing performance \n",adb_tuned_model_test_perf)
Testing performance
Accuracy Recall Precision F1
0 0.87463 0.880368 0.571713 0.693237
#####The recall on the test data is approximately equal to the recall on the validation data and slightly lower than the recall on the training data. The model is not overfitting.
print (pd.DataFrame(adb_tuned.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values(by = 'Imp', ascending = False))
Imp Total_Trans_Ct 2.445325e-01 Total_Revolving_Bal 1.597593e-01 Total_Ct_Chng_Q4_Q1 1.273364e-01 Months_Inactive_12_mon_1 8.352526e-02 Total_Amt_Chng_Q4_Q1 8.099159e-02 Total_Relationship_Count_2 7.245291e-02 Credit_Limit 7.228382e-02 Gender_M 2.557404e-02 Contacts_Count_12_mon_1 1.945846e-02 Customer_Age 1.891801e-02 Contacts_Count_12_mon_3 1.798185e-02 Total_Relationship_Count_6 1.748509e-02 Income_Category_$60K - $80K 1.619113e-02 Months_Inactive_12_mon_3 1.415810e-02 Total_Relationship_Count_5 8.938518e-03 Contacts_Count_12_mon_6 7.787545e-03 Total_Relationship_Count_4 5.603433e-03 Dependent_count_2 2.692930e-03 Marital_Status_Single 2.200285e-03 Months_Inactive_12_mon_4 1.133682e-03 Contacts_Count_12_mon_4 9.951152e-04 Months_Inactive_12_mon_6 1.625599e-16 Dependent_count_3 2.773149e-18 Income_Category_$80K - $120K 0.000000e+00 Contacts_Count_12_mon_5 0.000000e+00 Dependent_count_1 0.000000e+00 Contacts_Count_12_mon_2 0.000000e+00 Months_Inactive_12_mon_5 0.000000e+00 Months_Inactive_12_mon_2 0.000000e+00 Income_Category_Less than $40K 0.000000e+00 Dependent_count_4 0.000000e+00 Dependent_count_5 0.000000e+00 Marital_Status_Married 0.000000e+00 Total_Relationship_Count_3 0.000000e+00 Card_Category_Silver 0.000000e+00 Income_Category_$40K - $60K 0.000000e+00 Card_Category_Gold 0.000000e+00 Card_Category_Platinum 0.000000e+00
# Visualize the tuned AdaBoost feature importances as a horizontal bar chart,
# sorted ascending so the most important feature appears at the top.
feature_names = X_train.columns
importances = adb_tuned.feature_importances_
indices = np.argsort(importances)
bar_positions = range(len(indices))
plt.figure(figsize=(12, 12))
plt.title('Feature Importances')
plt.barh(bar_positions, importances[indices], color='violet', align='center')
plt.yticks(bar_positions, [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
# Rebuild the raw dataset for the production pipeline demo.
pipe_data = data.copy()
# separating data into X and Y
X_pipe = pipe_data.drop(['Attrition_Flag'], axis = 1)
y_pipe = pipe_data['Attrition_Flag']
# dividing data into train and test sets
# BUG FIX: stratify on y_pipe (this split's own target), not the earlier
# module-level y — they coincide here only by accident of the same source data.
X_pipe_train, X_pipe_test, y_pipe_train, y_pipe_test = train_test_split(
    X_pipe, y_pipe, test_size=0.30, random_state=1, stratify=y_pipe
)
# Balance the training classes 1:1 by undersampling the majority class.
rus = RandomUnderSampler(random_state=1, sampling_strategy = 1)
X_train_pipe_un, y_train_pipe_un = rus.fit_resample(X_pipe_train, y_pipe_train)
# One-hot encode categoricals (drop_first avoids the dummy trap).
X_train_pipe_un = pd.get_dummies(X_train_pipe_un, drop_first=True)
X_train_pipe_un.shape
(2278, 38)
# One-hot encode the pipeline test set the same way as the training set.
# NOTE(review): get_dummies is applied independently here — if a category is
# absent from either split, train/test columns can misalign; verify the shapes
# match (they do in the printed output: 38 columns each).
X_pipe_test = pd.get_dummies(X_pipe_test, drop_first=True)
X_pipe_test.shape
(3039, 38)
#Replace the outliers with Q3 for outliers above the median
def impute_outliers(df):
    """Cap extreme quarter-over-quarter ratio outliers at the column's Q3.

    In 'Total_Ct_Chng_Q4_Q1' and 'Total_Amt_Chng_Q4_Q1', values above 1.8 are
    replaced with that column's 75th percentile (computed before capping).
    Modifies *df* in place and returns it, matching the FunctionTransformer
    contract used by the pipeline below.

    BUG FIX: the original ignored its *df* argument and mutated the
    module-level train/test frames directly, so the pipeline's transform step
    never actually processed the data passed through it. It also relied on
    chained in-place `.mask` on a column selection, which does not reliably
    write back to the frame in newer pandas.
    """
    for col in ('Total_Ct_Chng_Q4_Q1', 'Total_Amt_Chng_Q4_Q1'):
        # Skip silently if a column is missing so the transformer stays robust.
        if col in df.columns:
            q3 = df[col].quantile(0.75)  # computed on the uncapped values
            df[col] = df[col].mask(df[col] > 1.8, q3)
    return df
# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
# Wrap the outlier-capping function so it can run as a pipeline step.
processing = FunctionTransformer(impute_outliers)
# End-to-end pipeline: outlier capping -> tuned AdaBoost classifier
# (same hyperparameters as adb_tuned selected above).
pipe = Pipeline(
steps=[
("data_processing", processing),
("AdaBoost",
AdaBoostClassifier(
n_estimators=60,
learning_rate=.05,
random_state=1,
base_estimator=DecisionTreeClassifier(max_depth=3, random_state=1),
),)])
# # Fit the model on training data
pipe.fit(X_train_pipe_un, y_train_pipe_un)
Pipeline(steps=[('data_processing',
FunctionTransformer(func=<function impute_outliers at 0x00000192DE683040>)),
('AdaBoost',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3,
random_state=1),
learning_rate=0.05, n_estimators=60,
random_state=1))])
# Training metrics for the full pipeline (outlier capping + tuned AdaBoost).
adb_pipe_under_model_train_perf=model_performance_classification_sklearn(pipe,X_train_pipe_un, y_train_pipe_un)
print("Training performance \n",adb_pipe_under_model_train_perf)
Training performance
Accuracy Recall Precision F1
0 0.893327 0.90518 0.88422 0.894577
# Test-set metrics for the full pipeline.
adb_pipe_tuned_model_test_perf=model_performance_classification_sklearn(pipe, X_pipe_test, y_pipe_test)
print("Testing performance \n",adb_pipe_tuned_model_test_perf)
Testing performance
Accuracy Recall Precision F1
0 0.866732 0.879098 0.553548 0.679335
The most important features, with apparent thresholds for when customers attrit, identified by both EDA (for the top 3) and post-model important-feature analysis, are:
Total transaction count - <80
Total Revolving balance - <10000
Total amount change ratio - < 1.1
Months Inactive (12)
Total amount change
Total relationship count
Credit Limit
I would recommend that the business set up a program to monitor these characteristics for all accounts, identify at-risk customers, and address the risk. Program elements might include: 1) Personalized contact with at-risk customers, including emails, phone calls, and other types of frequent communication 2) Set up a digital wallet program 3) Offer special-rate financing to at-risk customers
Other recommendations: Improve data collection techniques to ensure no missing data points and accurate data. The model's precision score is low - 55% - therefore the model is not good at predicting customers who do not churn; consider creating a separate model to focus on these customers (or improving this model). The recall score (88%) for the model could be improved through more data, more accurate data, or continuing to work with other model types.